Author: Javier Dolado. Collaborator: Daniel Rodriguez
This is the R Markdown document for the talk “Data Analysis in Software Engineering”, given at the University of Alcalá on Tuesday, 9th of February, 2016. This document requires the R system and the Rstudio installed. This document executes all chunks of R code and generates an html document.
The purpose of the talk is to give a hands-on experience of a data analysis in software engineering. We use basic statistical procedures for analysing some public software engineering datasets.
This is an R Markdown presentation. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document.
Important: before compiling this document you need to change the paths to the folders where you have downloaded the content of the companion GitHub repository (https://github.com/javierdolado/&lt;repository&gt;), i.e., change */home/javier/DocProjects/PRESI2013/&lt;folder&gt;* to your local path.
# One-time setup: install the packages needed to knit this document,
# then keep these lines commented out.
# install.packages("rmarkdown")
# install.packages("knitr")
# NOTE(review): hard-coded setwd() — adapt this path to your own machine
# (see the instructions at the top of the document).
setwd("/home/javier/DocProjects/PRESI2013/icebergFeb2016")
Running the `install.packages` commands shown above will install the rmarkdown and knitr packages on your R system.
# Draw two standard-normal-shaped densities side by side (sd = 1 and sd = 4),
# marking the mean +/- 2 SD region with vertical segments in each panel.
main.title <- "Area within 2 SD of the mean"
par(mfrow = c(1, 2))
for (sd_val in c(1, 4)) {
  plot(function(x) dnorm(x, mean = 0, sd = sd_val),
       xlim = c(-3 * sd_val, 3 * sd_val), main = paste("SD", sd_val),
       xlab = "x", ylab = "", cex = 2)
  # Vertical markers at -2*SD and +2*SD; segment height scales with the
  # density's peak (0.4 for sd = 1, 0.1 for sd = 4).
  marker_height <- 0.4 / sd_val
  segments(-2 * sd_val, 0, -2 * sd_val, marker_height)
  segments(2 * sd_val, 0, 2 * sd_val, marker_height)
}
# Sampling distribution of the mean: draw 1000 samples of size 40 from
# N(mean = 60, sd = 4) and record each sample mean.
sample.means <- rep(NA, 1000)
for (i in 1:1000) {
sample.40 <- rnorm(40, mean = 60, sd = 4)
sample.means[i] <- mean(sample.40)
}
# The mean of the sample means approximates the population mean (60), and
# their SD approximates the standard error 4/sqrt(40) ~= 0.632.
means40 <- mean(sample.means)
sd40 <- sd(sample.means)
means40
## [1] 59.99446
sd40
## [1] 0.6344591
hist(sample.means)
# Exploratory analysis of the Telecom1 dataset (size, effort, EstTotal).
options(digits=3)
path2files <- "~/DocProjects/PRESI2013/icebergFeb2016" # Set here the path to the folder where you have downloaded the files
setwd(path2files) #this command sets the default directory
telecom1 <- read.table("Telecom1.csv", sep=",",header=TRUE, stringsAsFactors=FALSE, dec = ".") #read data
summary(telecom1)
## size effort EstTotal
## Min. : 3.0 Min. : 24 Min. : 30
## 1st Qu.: 37.2 1st Qu.: 119 1st Qu.:142
## Median : 68.5 Median : 222 Median :289
## Mean :100.3 Mean : 284 Mean :320
## 3rd Qu.:164.0 3rd Qu.: 352 3rd Qu.:472
## Max. :284.0 Max. :1116 Max. :777
# Histograms and boxplots of size and effort, two panels per row.
par(mfrow=c(1,2)) #two figures per row
size_telecom1 <- telecom1$size
effort_telecom1 <- telecom1$effort
hist(size_telecom1, col="blue")
hist(effort_telecom1, col="blue")
boxplot(size_telecom1)
boxplot(effort_telecom1)
# Normal Q-Q plots: departures from the reference line indicate non-normality.
par(mfrow=c(1,1))
qqnorm(size_telecom1, main="Q-Q Plot of 'size'")
qqline(size_telecom1, col=2, lwd=2, lty=2) #draws a line through the first and third quartiles
qqnorm(effort_telecom1, main="Q-Q Plot of 'effort'")
qqline(effort_telecom1)
- We observe the non-normality of the data.
plot(size_telecom1, effort_telecom1)
# Exploratory analysis of the China dataset (ARFF format, read via foreign).
library(foreign)
china <- read.arff("china.arff")
# AFP = Adjusted Function Points (project size measure, see the histogram label).
china_size <- china$AFP
summary(china_size)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9 100 215 487 438 17500
china_effort <- china$Effort
summary(china_effort)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 26 704 1830 3920 3830 54600
par(mfrow=c(1,2))
hist(china_size, col="blue", xlab="Adjusted Function Points", main="Distribution of AFP")
hist(china_effort, col="blue",xlab="Effort", main="Distribution of Effort")
boxplot(china_size)
boxplot(china_effort)
# Q-Q plots against the normal distribution for both variables.
qqnorm(china_size)
qqline(china_size)
qqnorm(china_effort)
qqline(china_effort)
# Scatterplot and correlation between size and effort.
par(mfrow=c(1,1))
plot(china_size,china_effort)
cor(china_size,china_effort)
## [1] 0.685
cor.test(china_size,china_effort)
##
## Pearson's product-moment correlation
##
## data: china_size and china_effort
## t = 20, df = 500, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.635 0.729
## sample estimates:
## cor
## 0.685
# Rank-based alternatives, more robust to the skewness seen above.
cor(china_size,china_effort, method="spearman")
## [1] 0.649
cor(china_size,china_effort, method="kendall")
## [1] 0.468
Linear regression was first proposed many years ago, but it is still a very useful technique today.
The method used to choose the values \(b_0\) and \(b_1\) is to minimize the sum of the squares of the residual errors.
# Galton's classic parent/child height data, used to illustrate simple
# linear regression.
# NOTE(review): the library/data call below is commented out, so `galton`
# is undefined unless the UsingR package was loaded earlier — uncomment it
# (and install UsingR) to knit this chunk standalone.
# library(UsingR); data(galton)
par(mfrow=c(1,2))
hist(galton$child,col="blue",breaks=100)
hist(galton$parent,col="blue",breaks=100)
# Fit child height on parent height and overlay the least-squares line.
plot(galton$parent,galton$child,pch=1,col="blue", cex=0.4)
lm1 <- lm(galton$child ~ galton$parent)
lines(galton$parent,lm1$fitted,col="red",lwd=3)
# Residuals vs predictor: points should scatter evenly around the zero line.
plot(galton$parent,lm1$residuals,col="blue",pch=1, cex=0.4)
abline(c(0,0),col="red",lwd=3)
qqnorm(galton$child)
# Log-log linear model on the FULL China dataset.
# BUG FIX(review): the original chunk used logchina_size / logchina_effort
# before they were defined anywhere in the document (they are only created
# later from the *training* subset). The two log-transform lines below
# restore the missing definitions from china_size / china_effort (built
# earlier from china.arff), matching the printed coefficients (3.301, 0.768).
logchina_size <- log(china_size)
logchina_effort <- log(china_effort)
linmodel_logchina <- lm(logchina_effort ~ logchina_size)
par(mfrow=c(1,1))
plot(logchina_size, logchina_effort)
abline(linmodel_logchina, lwd=3, col=3)  # overlay the fitted regression line
par(mfrow=c(1,2))
plot(linmodel_logchina, ask = FALSE)  # standard lm diagnostic plots
linmodel_logchina
##
## Call:
## lm(formula = logchina_effort ~ logchina_size)
##
## Coefficients:
## (Intercept) logchina_size
## 3.301 0.768
# Fit the log-log model on a pre-selected China TRAINING subset (332 rows);
# the held-out test subset is evaluated in the next chunk.
chinaTrain <- read.arff("china3AttSelectedAFPTrain.arff")
nrow(chinaTrain)
## [1] 332
logchina_size <- log(chinaTrain$AFP)
logchina_effort <- log(chinaTrain$Effort)
linmodel_logchina_train <- lm(logchina_effort ~ logchina_size)
par(mfrow=c(1,1))
plot(logchina_size, logchina_effort)
abline(linmodel_logchina_train, lwd=3, col=4)
par(mfrow=c(1,2))
plot(linmodel_logchina_train, ask = FALSE)
linmodel_logchina_train
##
## Call:
## lm(formula = logchina_effort ~ logchina_size)
##
## Coefficients:
## (Intercept) logchina_size
## 3.249 0.784
# Geometric mean of a vector.
# Non-positive values are excluded from the log-sum but still counted in the
# denominator (length(x)), so zeros pull the result towards zero — this
# preserves the original behaviour used for the GMAR accuracy measure below.
# Args:
#   x     numeric vector
#   na.rm drop NA values inside the log-sum (passed to sum())
# Returns: a single numeric value, exp(mean of log of positive entries,
#          averaged over the full length of x).
gm_mean <- function(x, na.rm = TRUE) {
  exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x))
}
# Evaluate the trained China model on the held-out test subset and compute
# standard effort-estimation accuracy measures.
chinaTest <- read.arff("china3AttSelectedAFPTest.arff")
b0 <- linmodel_logchina_train$coefficients[1]
b1 <- linmodel_logchina_train$coefficients[2]
china_size_test <- chinaTest$AFP
actualEffort <- chinaTest$Effort
# Back-transform the log-log model: effort = exp(b0) * size^b1.
predEffort <- exp(b0+b1*log(china_size_test))
err <- actualEffort - predEffort #error or residual
ae <- abs(err)
hist(ae, main="Absolute Error in the China Test data")
# MAR: mean absolute residual; MRE: magnitude of relative error per project;
# MMRE / MdMRE: mean / median MRE; GMAR: geometric mean of absolute errors.
mar <- mean(ae)
mre <- ae/actualEffort
mmre <- mean(mre)
mdmre <- median(mre)
gmar <- gm_mean(ae)
mar
## [1] 1867
mmre
## [1] 1.15
mdmre
## [1] 0.551
gmar
## [1] 833
# Pred(0.25): fraction of predictions within +/-25% of the actual effort.
level_pred <- 0.25 #below and above (both)
lowpred <- actualEffort*(1-level_pred)
uppred <- actualEffort*(1+level_pred)
pred <- predEffort <= uppred & predEffort >= lowpred #pred is a vector with logical values
Lpred <- sum(pred)/length(pred)
Lpred
## [1] 0.186
# Train/test split (66%/34%) of Telecom1, a log-log regression on the
# training part, and a plot of the test points against the back-transformed
# regression curve with residual segments.
samplesize <- floor(0.66*nrow(telecom1))
set.seed(012) # to make the partition reproducible
train_idx <- sample(seq_len(nrow(telecom1)), size = samplesize)
telecom1_train <- telecom1[train_idx, ]
telecom1_test <- telecom1[-train_idx, ]
par(mfrow=c(1,1))
# transformation of variables to log-log
xtrain <- log(telecom1_train$size)
ytrain <- log(telecom1_train$effort)
lmtelecom1 <- lm( ytrain ~ xtrain)
plot(xtrain, ytrain)
abline(lmtelecom1, lwd=2, col="blue")
b0_tel1 <- lmtelecom1$coefficients[1]
b1_tel1 <- lmtelecom1$coefficients[2]
# calculate residuals and predicted values
res <- signif(residuals(lmtelecom1), 5)
xtest <- telecom1_test$size
ytest <- telecom1_test$effort
# BUG FIX(review): the original computed predictions with b0/b1, which are
# the coefficients of the *China* model fitted earlier — not the Telecom1
# coefficients b0_tel1/b1_tel1 computed just above.
pre_tel1 <- exp(b0_tel1 + b1_tel1*log(xtest))
# plot distances between points and the regression line
plot(xtest, ytest)
curve(exp(b0_tel1 + b1_tel1*log(x)), from=0, to=300, add=TRUE, col="blue", lwd=2)
segments(xtest, ytest, xtest, pre_tel1, col="red")
# Simple (untransformed) linear regression of effort on size for Telecom1,
# with Pred(0.25) and MAR computed on the fitted values.
par(mfrow=c(1,1))
lmtelecom <- lm(effort_telecom1 ~ size_telecom1)
plot(size_telecom1, effort_telecom1)
abline(lmtelecom, lwd=3, col="blue")
# calculate residuals and predicted values
# NOTE(review): `res` is computed but never used in this chunk.
res <- signif(residuals(lmtelecom), 5)
predicted <- predict(lmtelecom)
# plot distances between points and the regression line
segments(size_telecom1, effort_telecom1, size_telecom1, predicted, col="red")
# Pred(0.25): fraction of fitted values within +/-25% of the actual effort.
level_pred <- 0.25 #below and above (both)
lowpred <- effort_telecom1*(1-level_pred)
uppred <- effort_telecom1*(1+level_pred)
predict_inrange <- predicted <= uppred & predicted >= lowpred #pred is a vector with logical values
Lpred <- sum(predict_inrange)/length(predict_inrange)
Lpred
## [1] 0.444
#Visually plot lpred
segments(size_telecom1, lowpred, size_telecom1, uppred, col="red", lwd=3)
# MAR (mean absolute residual) of this model, used later for the SA measure.
err_telecom1 <- abs(effort_telecom1 - predicted)
mar_tel1 <- mean(err_telecom1)
mar_tel1
## [1] 125
# Random-guessing baseline (MARP0) for the China test data: for each test
# project, "predict" the actual effort of a randomly chosen *other* project
# (leave-one-out sampling), and average the resulting mean absolute errors
# over many runs.
estimEffChinaTest <- predEffort # This will be overwritten, no problem
numruns <- 9999
randguessruns <- rep(0, numruns)
for (i in seq_len(numruns)) {
for (j in seq_along(estimEffChinaTest)) {
estimEffChinaTest[j] <- sample(actualEffort[-j],1)} # replacement with random guessing
randguessruns[i] <- mean(abs(estimEffChinaTest-actualEffort))
}
marp0Chinatest <- mean(randguessruns)
marp0Chinatest
## [1] 3956
hist(randguessruns, main="MARP0 distribution of the China dataset")
# Standardised Accuracy: percentage improvement of the model's MAR over the
# random-guessing baseline MARP0.
saChina <- (1 - mar/marp0Chinatest)*100
saChina
## [1] 52.8
# Random-guessing baseline (MARP0) for Telecom1, analogous to the China
# chunk above, and the Standardised Accuracy of the Telecom1 linear model.
path2files <- "~/DocProjects/PRESI2013/icebergFeb2016"
setwd(path2files)
telecom1 <- read.table("Telecom1.csv", sep=",",header=TRUE, stringsAsFactors=FALSE, dec = ".") #read data
#par(mfrow=c(1,2))
#size <- telecom1[1]$size not needed now
actualEffTelecom1 <- telecom1[2]$effort
estimEffTelecom1 <- telecom1[3]$EstTotal # this will be overwritten
numruns <- 9999
randguessruns <- rep(0, numruns)
# For each run, replace every estimate by the actual effort of a randomly
# chosen *other* project (leave-one-out), then record the mean absolute error.
for (i in 1:numruns) {
for (j in 1:length(estimEffTelecom1)) {
estimEffTelecom1[j] <- sample(actualEffTelecom1[-j],1)}#replacement with random guessing
randguessruns[i] <- mean(abs(estimEffTelecom1-actualEffTelecom1))
}
marp0telecom1 <- mean(randguessruns)
marp0telecom1
## [1] 271
hist(randguessruns, main="MARP0 distribution of the Telecom1 dataset")
saTelecom1 <- (1- mar_tel1/marp0telecom1)*100
saTelecom1
## [1] 54
# NOTE(review): the output line below appears to be stray output left over
# from an earlier version of this chunk; it does not match any command here.
## [1] 281
set.seed(10)
# NOTE(review): norsim() is not defined anywhere in this document — it
# presumably comes from a package loaded elsewhere (it simulates confidence
# intervals for samples from a normal distribution); verify before knitting.
norsim(sims = 100, n = 36, mu = 100, sigma = 18, conf.level = 0.95)
# Bootstrap confidence intervals for the mean absolute error (MAR) of the
# China test data, using the boot package.
library(boot)
hist(ae, main="Absolute Errors of the China Test data")
level_confidence <- 0.95
repetitionsboot <- 9999
# Statistic for boot(): mean of the resampled values (d holds the indices).
samplemean <- function(x, d){return(mean(x[d]))}
b_mean <- boot(ae, samplemean, R=repetitionsboot)
# boot.ci() with defaults computes several interval types; the studentized
# interval is skipped because no bootstrap variances were supplied (warning).
confint_mean_China <- boot.ci(b_mean)
## Warning in boot.ci(b_mean): bootstrap variances needed for studentized
## intervals
confint_mean_China
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 9999 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = b_mean)
##
## Intervals :
## Level Normal Basic
## 95% (1415, 2309 ) (1389, 2282 )
##
## Level Percentile BCa
## 95% (1451, 2345 ) (1488, 2411 )
## Calculations and Intervals on Original Scale
# Bootstrap the mean of log(errors), i.e. the log of the geometric mean.
# Non-positive values are dropped before taking logs, and any remaining
# non-finite values are removed, just in case.
# Args:
#   error_vec numeric vector of (absolute) errors
#   R         number of bootstrap replicates; defaults to the global
#             `repetitionsboot` to stay call-compatible with the original,
#             but can now be passed explicitly.
# Returns: a boot object computed on the LOG scale — back-transform results
#          with exp() (see BCAciboot4geommean below... er, elsewhere).
boot_geom_mean <- function(error_vec, R = repetitionsboot){
  log_error <- log(error_vec[error_vec > 0])
  log_error <- log_error[is.finite(log_error)] # remove any -Inf/NaN before bootstrapping
  samplemean <- function(x, d){return(mean(x[d]))}
  b <- boot(log_error, samplemean, R = R) # with package boot
  # this is a boot for the logs
  return(b)
}
# Back-transformed BCa confidence interval for the geometric mean.
# Takes a boot object built on LOG-scale data, extracts the BCa interval
# (following 10.9 of Ugarte et al.'s book) and exponentiates the lower and
# upper endpoints (positions 4 and 5 of the bca row) back to original units.
# Relies on the global `level_confidence` for the confidence level.
BCAciboot4geommean <- function(b){
  bca_row <- boot.ci(b, conf = level_confidence, type = "bca")$bca
  # The bootstrap ran on logs: map both endpoints back to the original scale.
  bca_row[c(4, 5)] <- exp(bca_row[c(4, 5)])
  return(bca_row)
}
# Report the geometric mean (GMAR) of the China test absolute errors with
# its bootstrap BCa confidence interval.
# this is a boot object
b_gm <- boot_geom_mean(ae) #"ae" is the absolute error in the China Test data
# b_gm$t0 is the mean of the logs, so exp() gives the geometric mean.
print(paste0("Geometric Mean of the China Test data: ", round(exp(b_gm$t0), digits=3)))
## [1] "Geometric Mean of the China Test data: 832.55"
# Endpoints 4 and 5 of the returned bca row are the back-transformed bounds.
b_ci_gm <- BCAciboot4geommean(b_gm)
print(paste0("Confidence Interval: ", round(b_ci_gm[4], digits=3), " - ", round(b_ci_gm[5], digits=3)))
## [1] "Confidence Interval: 675.783 - 1012.201"
# Make a % confidence interval bca
# BCAciboot <- function(b){
# conf_int <- boot.ci(b, conf=level_confidence, type="bca")$bca #following 10.9 of Ugarte et al.'s book
# return (conf_int)
# }